In [ ]:
# Import necessary libraries
import os
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data.
# Fix: avoid relying on a hardcoded absolute local path — allow an override via
# the LOBLAWS_DATA environment variable so the notebook runs on other machines.
# The original path is kept as the default, so behavior is unchanged locally.
file_path = Path(os.environ.get("LOBLAWS_DATA", "/Users/lakshay/Desktop/graucia/loblaws.xlsx"))
df = pd.read_excel(file_path, sheet_name='E Comm')  # Load the E Comm sheet
In [ ]:
# Display the first few rows of the dataframe to understand its structure.
print(df.head())
print(df.columns)

# Data Cleaning — impute missing values.
# Fix: drop inplace=True (no performance benefit; causes hidden-state and
# chained-assignment pitfalls on re-run) and reassign instead.
# 1) Numerical NaNs -> column median (robust to skewed columns like Tenure).
df = df.fillna(df.median(numeric_only=True))
# 2) Remaining NaNs (only categorical columns at this point) -> column mode.
df = df.fillna(df.mode().iloc[0])
CustomerID Churn Tenure PreferredLoginDevice CityTier WarehouseToHome \
0 50001 1 4.0 Mobile Phone 3 6.0
1 50002 1 NaN Phone 1 8.0
2 50003 1 NaN Phone 1 30.0
3 50004 1 0.0 Phone 3 15.0
4 50005 1 0.0 Phone 1 12.0
PreferredPaymentMode Gender HourSpendOnApp NumberOfDeviceRegistered \
0 Debit Card Female 3.0 3
1 UPI Male 3.0 4
2 Debit Card Male 2.0 4
3 Debit Card Male 2.0 4
4 CC Male NaN 3
PreferedOrderCat SatisfactionScore MaritalStatus NumberOfAddress \
0 Laptop & Accessory 2 Single 9
1 Mobile 3 Single 7
2 Mobile 3 Single 6
3 Laptop & Accessory 5 Single 8
4 Mobile 5 Single 3
Complain OrderAmountHikeFromlastYear CouponUsed OrderCount \
0 1 11.0 1.0 1.0
1 1 15.0 0.0 1.0
2 1 14.0 0.0 1.0
3 0 23.0 0.0 1.0
4 0 11.0 1.0 1.0
DaySinceLastOrder CashbackAmount
0 5.0 159.93
1 0.0 120.90
2 3.0 120.28
3 3.0 134.07
4 3.0 129.60
Index(['CustomerID', 'Churn', 'Tenure', 'PreferredLoginDevice', 'CityTier',
'WarehouseToHome', 'PreferredPaymentMode', 'Gender', 'HourSpendOnApp',
'NumberOfDeviceRegistered', 'PreferedOrderCat', 'SatisfactionScore',
'MaritalStatus', 'NumberOfAddress', 'Complain',
'OrderAmountHikeFromlastYear', 'CouponUsed', 'OrderCount',
'DaySinceLastOrder', 'CashbackAmount'],
dtype='object')
In [ ]:
# Ensure Churn is treated as an integer label (0 = retained, 1 = churned).
df['Churn'] = df['Churn'].astype(int)

# Identify categorical (object-dtype) columns.
categorical_columns = df.select_dtypes(include=['object']).columns
print("Categorical columns:", categorical_columns)

# Encode categorical columns.
# Fix: the original reused ONE LabelEncoder instance for every column, so only
# the last column's fitted mapping survived and no column could be
# inverse-transformed afterwards. Keep one fitted encoder per column; the
# encoded integer values produced are identical to the original's.
from sklearn.preprocessing import LabelEncoder
label_encoders = {}
for col in categorical_columns:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder  # retained so encodings can be decoded later

# Identify numerical columns (every column is numeric after encoding).
numerical_columns = df.select_dtypes(include=[np.number]).columns
print("Numerical columns:", numerical_columns)
Categorical columns: Index(['PreferredLoginDevice', 'PreferredPaymentMode', 'Gender',
'PreferedOrderCat', 'MaritalStatus'],
dtype='object')
Numerical columns: Index(['CustomerID', 'Churn', 'Tenure', 'PreferredLoginDevice', 'CityTier',
'WarehouseToHome', 'PreferredPaymentMode', 'Gender', 'HourSpendOnApp',
'NumberOfDeviceRegistered', 'PreferedOrderCat', 'SatisfactionScore',
'MaritalStatus', 'NumberOfAddress', 'Complain',
'OrderAmountHikeFromlastYear', 'CouponUsed', 'OrderCount',
'DaySinceLastOrder', 'CashbackAmount'],
dtype='object')
In [ ]:
# Descriptive statistics for all (now numeric) columns.
print(df.describe())

# Churn distribution — how imbalanced is the target?
plt.figure(figsize=(8, 6))
sns.countplot(x='Churn', data=df)
plt.title('Churn Distribution')
plt.show()

# Per-feature distributions, stacked by churn status, with a KDE overlay.
feature_columns = [name for name in df.columns if name != 'Churn']
for feature in feature_columns:
    plt.figure(figsize=(8, 6))
    sns.histplot(data=df, x=feature, hue='Churn', multiple='stack', kde=True)
    plt.title(f'Distribution of {feature} with respect to Churn')
    plt.show()
CustomerID Churn Tenure PreferredLoginDevice \
count 5630.000000 5630.000000 5630.000000 5630.000000
mean 52815.500000 0.168384 10.134103 0.928419
std 1625.385339 0.374240 8.357951 0.709822
min 50001.000000 0.000000 0.000000 0.000000
25% 51408.250000 0.000000 3.000000 0.000000
50% 52815.500000 0.000000 9.000000 1.000000
75% 54222.750000 0.000000 15.000000 1.000000
max 55630.000000 1.000000 61.000000 2.000000
CityTier WarehouseToHome PreferredPaymentMode Gender \
count 5630.000000 5630.000000 5630.000000 5630.000000
mean 1.654707 15.566785 3.548135 0.601066
std 0.915389 8.345961 1.389659 0.489723
min 1.000000 5.000000 0.000000 0.000000
25% 1.000000 9.000000 3.000000 0.000000
50% 1.000000 14.000000 4.000000 1.000000
75% 3.000000 20.000000 4.000000 1.000000
max 3.000000 127.000000 6.000000 1.000000
HourSpendOnApp NumberOfDeviceRegistered PreferedOrderCat \
count 5630.000000 5630.000000 5630.000000
mean 2.934636 3.688988 2.369627
std 0.705528 1.023999 1.411435
min 0.000000 1.000000 0.000000
25% 2.000000 3.000000 2.000000
50% 3.000000 4.000000 2.000000
75% 3.000000 4.000000 4.000000
max 5.000000 6.000000 5.000000
SatisfactionScore MaritalStatus NumberOfAddress Complain \
count 5630.000000 5630.000000 5630.000000 5630.000000
mean 3.066785 1.168384 4.214032 0.284902
std 1.380194 0.664344 2.583586 0.451408
min 1.000000 0.000000 1.000000 0.000000
25% 2.000000 1.000000 2.000000 0.000000
50% 3.000000 1.000000 3.000000 0.000000
75% 4.000000 2.000000 6.000000 1.000000
max 5.000000 2.000000 22.000000 1.000000
OrderAmountHikeFromlastYear CouponUsed OrderCount \
count 5630.000000 5630.000000 5630.000000
mean 15.674600 1.716874 2.961812
std 3.591058 1.857640 2.879248
min 11.000000 0.000000 1.000000
25% 13.000000 1.000000 1.000000
50% 15.000000 1.000000 2.000000
75% 18.000000 2.000000 3.000000
max 26.000000 16.000000 16.000000
DaySinceLastOrder CashbackAmount
count 5630.000000 5630.000000
mean 4.459325 177.223030
std 3.570626 49.207036
min 0.000000 0.000000
25% 2.000000 145.770000
50% 3.000000 163.280000
75% 7.000000 196.392500
max 46.000000 324.990000
In [ ]:
# Correlation Matrix.
# Fix: exclude CustomerID — it is an arbitrary row identifier, so any
# correlation involving it is spurious and only adds noise to the heatmap.
plt.figure(figsize=(14, 12))
sns.heatmap(df.drop(columns=['CustomerID']).corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Pair plots (CustomerID excluded for the same reason).
# NOTE(review): ~19 features makes this a very large grid — this cell is slow.
sns.pairplot(df.drop(columns=['CustomerID']), hue='Churn', diag_kind='kde')
plt.show()
In [ ]:
# Further EDA based on feature importance
import shap

# Data Splitting.
# Fix 1: drop CustomerID from the features — it is a unique per-row identifier,
# and a tree model will happily (and meaninglessly) split on it, leaking
# row identity into the model and polluting the feature importances.
X = df.drop(['Churn', 'CustomerID'], axis=1)
y = df['Churn']

from sklearn.model_selection import train_test_split
# Fix 2: stratify on y — churn is imbalanced (~16.8% positive, per describe()),
# so stratification keeps the class ratio identical in train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Model Training.
from sklearn.ensemble import RandomForestClassifier
# Fix 3: set random_state so the fitted forest (and the downstream SHAP values
# and feature importances) are reproducible under Restart & Run All.
best_model = RandomForestClassifier(random_state=42)
best_model.fit(X_train, y_train)
Out[ ]:
RandomForestClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier()
In [ ]:
# SHAP for Model Interpretation.
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test)
# Fix: for binary classifiers, older shap versions return a LIST of two arrays
# (one per class); passing the list to summary_plot renders only a bar chart.
# Select the positive (churn) class so we get the informative beeswarm plot,
# while newer array-returning versions keep the original call unchanged.
if isinstance(shap_values, list):
    shap.summary_plot(shap_values[1], X_test)
else:
    shap.summary_plot(shap_values, X_test)

# Feature Importance from Random Forest (impurity-based importances).
importances = best_model.feature_importances_
indices = np.argsort(importances)[::-1]  # indices sorted by importance, descending

# Plot the feature importances.
plt.figure(figsize=(12, 6))
plt.title("Feature Importances")
plt.bar(range(X_train.shape[1]), importances[indices], align="center")
plt.xticks(range(X_train.shape[1]), X_train.columns[indices], rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.show()